library(dplyr)
library(ggplot2)
library(DT)
library(here)

2 Model Validation
2.1 Recap of tutorial 1
Understand random variables through the following simulation:
## Simulate a population of n_sample observations from the linear model
##   y = beta0 + beta1 * x + e,   x ~ N(5, 10^2),   e ~ N(0, 5^2)
## NOTE(review): no set.seed() before these draws, so pop_dat (and every
## downstream number in this section) differs between runs.
n_sample <- 1000
beta0 <- 0.5
beta1 <- 2
x <- rnorm(n_sample, mean = 5, sd = 10)
random_error <- rnorm(n_sample, mean = 0, sd = 5)
y <- beta0 + beta1 * x + random_error
pop_dat <- data.frame(y = y,
                      x = x)
## Scatter plot of the simulated population
pop_dat %>%
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  theme_bw()

Observed standard deviation of the data:
## Histogram of the response y in the population
pop_dat %>%
  ggplot(aes(y)) +
  geom_histogram() +
  theme_bw()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Marginal standard deviation of y (dominated by beta1 * sd(x) here)
sd(pop_dat$y)
#> [1] 20.08596
sample_size <- 100
## Draw a random sample of sample_size rows from the population.
## Fixed: the original `sample(c(1:sample_size), replace = FALSE)` only
## permuted the integers 1:100, i.e. it always returned the FIRST 100 rows of
## pop_dat in shuffled order rather than a random sample of the population.
## The corrected form below matches the scheme used later in this tutorial.
sample_dat <- pop_dat[sample(seq_len(nrow(pop_dat)), sample_size, replace = FALSE), ]
sample_dat %>% datatable()
## Scatter plot of the sampled data
sample_dat %>%
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  theme_bw()

The standard deviation of the estimator (\(\hat\beta\)):
## Fit the working model y ~ x on the sample
glm_m <- glm(y ~ x, data = sample_dat)
set.seed(1017)
## lapply maps the model fit over 100 replicates (a functional alternative to
## a for loop; it is NOT parallel by itself).
## NOTE(review): `sample(100, replace = FALSE)` merely permutes the integers
## 1:100, so every replicate refits the model on the SAME first 100 rows of
## pop_dat -- the slope estimate is identical across all 100 replicates (see
## the repeated 1.94098 output below). The corrected resampling scheme, which
## draws sample_size rows from the full population, appears in the next chunk.
coef_dat <- lapply(1:100, function(i){
sample_dat <- pop_dat[sample(100,replace = FALSE),]
glm_m <- glm(y~x, data= sample_dat)
coef(glm_m)[2]
})
coef_dat %>% head()[[1]]
x
1.94098
[[2]]
x
1.94098
[[3]]
x
1.94098
[[4]]
x
1.94098
[[5]]
x
1.94098
[[6]]
x
1.94098
## Corrected resampling: each replicate draws sample_size distinct rows from
## the FULL population (1:n_sample), refits y ~ x, and keeps the slope.
set.seed(1017)
beta_hat_dat <- unlist(lapply(seq_len(100), function(rep_id) {
  ## sample_size rows sampled without replacement from all nrow(pop_dat) rows
  row_idx <- sample(seq_len(nrow(pop_dat)), sample_size, replace = FALSE)
  fit_rep <- glm(y ~ x, data = pop_dat[row_idx, ])
  coef(fit_rep)[2]  # slope estimate for this replicate
}))
## Histogram of the 100 slope estimates: the sampling distribution of beta_hat.
## (Uses an explicit column name instead of the magrittr-generated "." column,
## which the original relied on via aes(.).)
data.frame(beta_hat = beta_hat_dat) %>%
  ggplot(aes(x = beta_hat)) +
  geom_histogram()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Repeat the resampling experiment for several sample sizes; each column of
## beta_hat ends up holding 100 slope estimates for one sample size.
## NOTE(review): growing beta_hat with cbind() inside the loop is an R
## anti-pattern (copies the object every iteration); harmless here with only
## 5 iterations. Also note there is no set.seed() before this loop, so the
## numbers vary between runs.
beta_hat <- NULL
for(sample_size in c(20,100,200,500,1000)){
beta_hat_tmp <- lapply(1:100, function(i) {
sample_dat <- pop_dat[sample(seq_len(nrow(pop_dat)), ## sample from 1: n_sample
sample_size, ## sample size
replace = FALSE), ]
glm_m <- glm(y ~ x, data = sample_dat)
coef(glm_m)[2]
}) %>% unlist()
beta_hat <- cbind(beta_hat,beta_hat_tmp )
}
beta_hat <- beta_hat %>% data.frame()
colnames(beta_hat) <- paste0("sample size = ",c(20,100,200,500,1000))
beta_hat %>%
  datatable()

## Monte Carlo SD of beta_hat for each sample size. Note that with
## sample size = 1000 every "sample" is the entire population (sampling 1000
## from 1000 without replacement), so its SD is numerically zero (~2.5e-15).
apply(beta_hat, 2, sd)
#>  sample size = 20 sample size = 100 sample size = 200 sample size = 500
#>      1.331442e-01     4.579778e-02     3.317768e-02     1.444121e-02
#> sample size = 1000
#>      2.500723e-15
Visualize the distribution of \(\hat\beta\):
library(tidyr)
Warning: package 'tidyr' was built under R version 4.3.2
## Reshape to long format -- one row per (sample size, beta_hat) pair -- and
## convert the "sample size" label to a numeric value in a single pipeline.
plot_dat <- beta_hat %>%
  pivot_longer(
    cols = everything(),
    names_to = "sample size",
    names_prefix = "sample size =",
    values_to = "beta_hat"
  ) %>%
  mutate(`sample size` = as.numeric(`sample size`))
## Ordered factor so facets/legend appear in increasing sample size
plot_dat$`sample size` <- factor(plot_dat$`sample size`,
                                 levels = c("20", "100", "200", "500", "1000"),
                                 labels = c("20", "100", "200", "500", "1000"))
## Faceted histograms of beta_hat: the sampling distribution tightens around
## the true slope as the sample size grows.
plot_dat %>%
  ggplot(aes(x = beta_hat, color = `sample size`, fill = `sample size`)) +
  geom_histogram() +
  facet_grid(cols = vars(`sample size`), scales = "free_y") +
  theme_bw() +
  theme(legend.position = "bottom")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
2.2 Tutorial 2: Model Validation
Use the hwy2.RData file available from the course page. After attaching the rms package and doing the usual datadist()/options() setup:
1. Fit the following model
fit <- ols(rate~rcs(trks,4)+rcs(sigs1,4)+type+hwy,data=hwy2,x=TRUE,y=TRUE)

2. Run both the ordinary bootstrap validation and the .632 bootstrap validation on this model. Compare the results.
## Start the rms tutorial section with a clean workspace.
## NOTE(review): rm(list = ls()) is discouraged in shared scripts; it is kept
## here because this document deliberately resets state between sections.
rm(list = ls())
library(rms)
library(dplyr)
## change your working directory as necessary
load(here("data", "hwy2.RData"))
## Inspect the highway accident data (39 obs. x 13 variables)
str(hwy2)
'data.frame': 39 obs. of 13 variables:
$ rate : num 4.58 2.86 3.02 2.29 1.61 6.87 3.85 6.12 3.29 5.88 ...
$ len : num 4.99 16.11 9.75 10.65 20.01 ...
$ ADT : int 69 73 49 61 28 30 46 25 43 23 ...
$ trks : int 8 8 10 13 12 6 8 9 12 7 ...
$ sigs1: num 0.2004 0.0621 0.1026 0.0939 0.05 ...
$ slim : int 55 60 60 65 70 55 55 55 50 50 ...
$ shld : int 10 10 10 10 10 10 8 10 4 5 ...
$ lane : int 8 4 4 6 4 4 4 4 4 4 ...
$ acpt : num 4.6 4.4 4.7 3.8 2.2 24.8 11 18.5 7.5 8.2 ...
$ itg : num 1.2 1.43 1.54 0.94 0.65 0.34 0.47 0.38 0.95 0.12 ...
$ lwid : int 12 12 12 12 12 12 12 12 12 12 ...
$ hwy : Factor w/ 4 levels "FAI","MA","MC",..: 1 1 1 1 1 4 4 4 4 4 ...
$ type : Factor w/ 2 levels "regular","major": 2 2 2 2 2 2 2 2 2 2 ...
When working with factor/nominal/categorical variables in R, preprocessing is necessary to ensure the data is properly handled by statistical models and algorithms. Many R functions automatically perform dummy coding when a variable is specified as a factor, typically using the as.factor() function.
Side note: Dummy coding (sometimes confused with one-hot encoding in CS) is just one way to code factor variables. It is popular because of its ease of interpretation.
A complete process of model validation usually consists of two parts: internal and external validation. Ideally (when you’re lucky 😄), you will have two independent datasets, such as EHR data from two hospitals. You will first build (or train) and validate your model internally using data from hospital A (the train-validation dataset). Then the model can be sent to hospital B for external validation (the test set). If you do not have two datasets, you can manually divide the data into two parts, with one part mimicking the external dataset.
Split-sample or split-sample averaged(SSA)
K-fold Cross-validation
Bootstrap: ordinary, .632
## Demonstrate the ".632 rule": a bootstrap sample (drawn with replacement)
## contains on average about 63.2% (= 1 - 1/e) of the distinct original points.
set.seed(1017)
n_sample <- 100000  # sample size 100k
patient_id <- seq_len(n_sample)  # safer than c(1:n_sample)
# sample with replacement
sample_dat <- sample(patient_id, size = n_sample, replace = TRUE)
# proportion of distinct original points appearing in the bootstrap sample
round(length(unique(sample_dat)) / n_sample, 3)
#> [1] 0.632
## Register the data distribution summary required by rms plotting/validation
h.dd <- datadist(hwy2)
options(datadist = "h.dd")
## Restricted cubic splines with 4 knots for trks and sigs1; factor variables
## type and hwy are dummy-coded automatically. x = TRUE, y = TRUE store the
## design matrix and response so validate() can resample and refit later.
fit <- ols(rate ~ rcs(trks, 4) +
             rcs(sigs1, 4) + type + hwy,
           data = hwy2, x = TRUE, y = TRUE)
## Peek at the stored design matrix (spline basis + dummy columns)
fit$x %>% head()
  trks trks' trks'' sigs1 sigs1' sigs1'' type=major
1 8 0.02777778 0.0000000 0.20040080 2.023540e-03 0.0004753511 1
2 8 0.02777778 0.0000000 0.06207325 7.616807e-07 0.0000000000 1
3 10 0.75000000 0.2222222 0.10256410 8.221905e-05 0.0000000000 1
4 13 4.50000000 2.2222222 0.09389671 4.716474e-05 0.0000000000 1
5 12 3.02777778 1.4074074 0.04997501 0.000000e+00 0.0000000000 1
6 6 0.00000000 0.0000000 2.00750419 1.016433e+00 0.7779110205 1
hwy=MA hwy=MC hwy=PA
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
6 0 0 1
## Ordinary bootstrap validation (B = 100 resamples): optimism is estimated as
## the average (training - test) gap across bootstrap refits, and
## index.corrected = index.orig - optimism.
set.seed(1017)
validate(fit, B=100)
Divergence or singularity in 14 samples
index.orig training test optimism index.corrected n
R-square 0.7110 0.7542 0.5195 0.2346 0.4764 86
MSE 1.1106 0.8411 1.8466 -1.0055 2.1161 86
g 1.8604 1.7997 1.6890 0.1107 1.7497 86
Intercept 0.0000 0.0000 0.4525 -0.4525 0.4525 86
Slope 1.0000 1.0000 0.8944 0.1056 0.8944 86
## .632 bootstrap validation (B = 100): combines apparent and out-of-bag
## performance, weighting the out-of-bag (test) estimate by 0.632 -- the
## expected fraction of distinct original observations in a bootstrap sample.
set.seed(1017)
validate(fit,method = ".632",B=100)
Divergence or singularity in 14 samples
index.orig training test optimism index.corrected n
R-square 0.7110 0.7542 -0.0506 0.4814 0.2297 86
MSE 1.1106 0.8411 2.8875 -1.1230 2.2336 86
g 1.8604 1.7997 1.1931 0.4217 1.4386 86
Intercept 0.0000 0.0000 0.9201 -0.5815 0.5815 86
Slope 1.0000 1.0000 0.6309 0.2332 0.7668 86
Bootstrapping with replacement leads to some data points being repeated in the bootstrap sample. As a result, the model may end up ‘memorizing’ these repeated points, which can make its apparent (training) performance look better than its performance on truly unseen data — this gap is the optimism that validation corrects for.